## summary of the output from the 1d model with estimated alpha
## (balanced data)
##
##

library(coda)
load("pairData.Rda")


load("out-1d-alpha-nofraud-balanced-newIDs.Rda")
out <- out.1d.alpha

thetas <- out[, grep("theta", colnames(out))]
alphas <- out[, grep("alpha", colnames(out))] 

M <- nrow(out) ## number of MCMC draws

## calculate respondent-statement-specific in-sample prediction error rates
pairData.aug <- pairData
pairData.aug$post.prob.correct <- NA
for (i in 1:nrow(pairData.aug)){
    theta.vec.s1 <- thetas[,paste("theta",
                                  pairData.aug$statementID1[i], sep=".")]
    theta.vec.s2 <- thetas[,paste("theta",
                                  pairData.aug$statementID2[i], sep=".")]
    alpha.vec <- alphas[,paste("alpha", 
                                  pairData.aug$respondent[i], sep=".")]

    if (pairData.aug$choice[i] == pairData.aug$statementID1[i]){
        pairData.aug$post.prob.correct[i] = mean(pnorm(alpha.vec*(
            theta.vec.s1 - theta.vec.s2)))
    }
    if (pairData.aug$choice[i] == pairData.aug$statementID2[i]){
        pairData.aug$post.prob.correct[i] = mean(pnorm(alpha.vec*(
            theta.vec.s2 - theta.vec.s1)))
    }

    if (i %% 1000 == 0){
        cat("i =", i, "  of", nrow(pairData.aug), "\n")
    }
}


#> mean(pairData.aug$post.prob.correct)
#[1] 0.5509023
#> median(pairData.aug$post.prob.correct)
#[1] 0.5215582
#> mean(pairData.aug$post.prob.correct < 0.5)
#[1] 0.3547868



hist(pairData.aug$post.prob.correct, nclass=100, col="dodgerblue")


## aggregate to respondent
resp.id <- sort(unique(pairData.aug$respondent))
n.resp <- length(resp.id)
resp.post.prob.correct <- rep(NA, n.resp)

for (i in 1:n.resp){
    data.sub <- pairData.aug[pairData.aug$respondent == resp.id[i],]
    resp.post.prob.correct[i] <- mean(data.sub$post.prob.correct)
}


## by construction, the model does better than chance for every respondent
## > mean(resp.post.prob.correct < 0.5)
## [1] 0
##
## However, it doesn't do much better than chance for most respondents
## > mean(resp.post.prob.correct < 0.53)
## [1] 0.5284243
##>  mean(resp.post.prob.correct > 0.6)
##[1] 0.1892407

hist(resp.post.prob.correct, nclass=100, col=rgb(0, 0.4, 0.7))



## aggregate to statement
statement.id <- sort(unique(c(pairData.aug$statementID1,
                              pairData.aug$statementID2)))
n.statements <- length(statement.id)
statement.post.prob.correct <- rep(NA, n.statements)

for (i in 1:n.statements){
    data.sub <- pairData.aug[pairData.aug$statementID1 == statement.id[i] |
                            pairData.aug$statementID2 == statement.id[i],]
    statement.post.prob.correct[i] <- mean(data.sub$post.prob.correct)
}



## Predictions aren't much better than chance at the statement level as well
##> mean(statement.post.prob.correct)
##[1] 0.5508973
##>  median(statement.post.prob.correct)
##[1] 0.541523
##> mean(statement.post.prob.correct > 0.6)
##[1] 0.04761905
##> sum(statement.post.prob.correct > 0.6)
##[1] 2
##> 2/42
##[1] 0.04761905

hist(statement.post.prob.correct, nclass=20, col="dodgerblue")






## compare rank order correlation of theta point estimates and objective truth
## to rank order correlation of theta point estimates and valence
##


statements <- read.csv("./selectedStatementsPunctuationFixed-balanced.csv", header=TRUE, stringsAsFactors=FALSE)

statements <- statements[statements$keep==1,]

valence <- statements$valence
valence[valence=="R"] <- 1
valence[valence=="N"] <- 0
valence[valence=="L"] <- -1
valence <- as.numeric(valence)

obj.truth.6pt <- rep(NA, nrow(statements))
obj.truth.6pt[statements$rating == "pants-fire"] <- 0
obj.truth.6pt[statements$rating == "FALSE"] <- 1
obj.truth.6pt[statements$rating == "barely-true"] <- 2
obj.truth.6pt[statements$rating == "half-true"] <- 3
obj.truth.6pt[statements$rating == "mostly-true"] <- 4
obj.truth.6pt[statements$rating == "TRUE"] <- 5


theta.hat <- colMeans(thetas)


print(cor(valence, theta.hat, method="spearman"))
print(cor(obj.truth.6pt, theta.hat, method="spearman"))


## the single latent dimension is more strongly correlated with partisanship
## than objective truth:
##
## > print(cor(valence, theta.hat, method="spearman"))
## [1] -0.7337712
## > print(cor(obj.truth.6pt, theta.hat, method="spearman"))
## [1] 0.3161964


alpha.hat <- apply(alphas, 2, mean)
hist(alpha.hat)

mean(alpha.hat > 0)
mean(alpha.hat < 0)

##> mean(alpha.hat > 0)
##[1] 0.6188478
##> mean(alpha.hat < 0)
##[1] 0.3811522
